{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Between Human Dignity and Security: Identifying Citizen and Elite Preferences and Concerns over Refugee Reception\n",
    "\n",
    "---\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:17:42.886198Z",
     "iopub.status.busy": "2022-07-03T13:17:42.884514Z",
     "iopub.status.idle": "2022-07-03T13:24:16.662775Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"package 'stringr' was built under R version 4.3.3\"\n",
      "Warning message:\n",
      "\"package 'lubridate' was built under R version 4.3.3\"\n",
      "── \u001b[1mAttaching core tidyverse packages\u001b[22m ──────────────────────── tidyverse 2.0.0 ──\n",
      "\u001b[32m✔\u001b[39m \u001b[34mdplyr    \u001b[39m 1.1.2     \u001b[32m✔\u001b[39m \u001b[34mreadr    \u001b[39m 2.1.4\n",
      "\u001b[32m✔\u001b[39m \u001b[34mforcats  \u001b[39m 1.0.0     \u001b[32m✔\u001b[39m \u001b[34mstringr  \u001b[39m 1.5.1\n",
      "\u001b[32m✔\u001b[39m \u001b[34mggplot2  \u001b[39m 3.4.2     \u001b[32m✔\u001b[39m \u001b[34mtibble   \u001b[39m 3.2.1\n",
      "\u001b[32m✔\u001b[39m \u001b[34mlubridate\u001b[39m 1.9.4     \u001b[32m✔\u001b[39m \u001b[34mtidyr    \u001b[39m 1.3.0\n",
      "\u001b[32m✔\u001b[39m \u001b[34mpurrr    \u001b[39m 1.0.1     \n",
      "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────── tidyverse_conflicts() ──\n",
      "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
      "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m    masks \u001b[34mstats\u001b[39m::lag()\n",
      "\u001b[36mℹ\u001b[39m Use the conflicted package (\u001b[3m\u001b[34m<http://conflicted.r-lib.org/>\u001b[39m\u001b[23m) to force all conflicts to become errors\n",
      "Warning message:\n",
      "\"package 'RColorBrewer' was built under R version 4.3.1\"\n",
      "Warning message:\n",
      "\"package 'tm' was built under R version 4.3.3\"\n",
      "Loading required package: NLP\n",
      "\n",
      "\n",
      "Attaching package: 'NLP'\n",
      "\n",
      "\n",
      "The following object is masked from 'package:ggplot2':\n",
      "\n",
      "    annotate\n",
      "\n",
      "\n",
      "\n",
      "Attaching package: 'gridExtra'\n",
      "\n",
      "\n",
      "The following object is masked from 'package:dplyr':\n",
      "\n",
      "    combine\n",
      "\n",
      "\n",
      "Warning message:\n",
      "\"package 'pastecs' was built under R version 4.3.3\"\n",
      "\n",
      "Attaching package: 'pastecs'\n",
      "\n",
      "\n",
      "The following objects are masked from 'package:dplyr':\n",
      "\n",
      "    first, last\n",
      "\n",
      "\n",
      "The following object is masked from 'package:tidyr':\n",
      "\n",
      "    extract\n",
      "\n",
      "\n",
      "Warning message:\n",
      "\"package 'quanteda' was built under R version 4.3.3\"\n",
      "Warning message in .recacheSubclasses(def@className, def, env):\n",
      "\"undefined subclass \"ndiMatrix\" of class \"replValueSp\"; definition not updated\"\n",
      "Warning message in .recacheSubclasses(def@className, def, env):\n",
      "\"undefined subclass \"pcorMatrix\" of class \"replValueSp\"; definition not updated\"\n",
      "Package version: 4.2.0\n",
      "Unicode version: 13.0\n",
      "ICU version: 69.1\n",
      "\n",
      "Parallel computing: 12 of 12 threads used.\n",
      "\n",
      "See https://quanteda.io for tutorials and examples.\n",
      "\n",
      "\n",
      "Attaching package: 'quanteda'\n",
      "\n",
      "\n",
      "The following object is masked from 'package:tm':\n",
      "\n",
      "    stopwords\n",
      "\n",
      "\n",
      "The following objects are masked from 'package:NLP':\n",
      "\n",
      "    meta, meta<-\n",
      "\n",
      "\n",
      "Warning message:\n",
      "\"package 'quanteda.textstats' was built under R version 4.3.3\"\n",
      "Warning message in .recacheSubclasses(def@className, def, env):\n",
      "\"undefined subclass \"ndiMatrix\" of class \"replValueSp\"; definition not updated\"\n",
      "Warning message in .recacheSubclasses(def@className, def, env):\n",
      "\"undefined subclass \"pcorMatrix\" of class \"replValueSp\"; definition not updated\"\n",
      "Warning message:\n",
      "\"package 'quanteda.textplots' was built under R version 4.3.3\"\n",
      "Warning message:\n",
      "\"package 'readtext' was built under R version 4.3.3\"\n",
      "\n",
      "Attaching package: 'readtext'\n",
      "\n",
      "\n",
      "The following object is masked from 'package:quanteda':\n",
      "\n",
      "    texts\n",
      "\n",
      "\n",
      "Warning message:\n",
      "\"package 'ggwordcloud' was built under R version 4.3.3\"\n",
      "Warning message:\n",
      "\"package 'extrafont' was built under R version 4.3.1\"\n",
      "Registering fonts with R\n",
      "\n",
      "Warning message:\n",
      "\"package 'ggpubr' was built under R version 4.3.3\"\n"
     ]
    }
   ],
   "source": [
    "# =====================================================\n",
    "# Required Package Installation (Uncomment if needed)\n",
    "# =====================================================\n",
    "# Uncomment the lines below to install any package that is missing from your R library.\n",
    "# install.packages(\"quanteda\")\n",
    "# install.packages(\"quanteda.textstats\")\n",
    "# install.packages(\"quanteda.textplots\")\n",
    "# install.packages(\"lubridate\")\n",
    "# install.packages(\"readtext\")\n",
    "# install.packages(\"corpus\")\n",
    "# install.packages(\"pastecs\")\n",
    "# install.packages(\"RColorBrewer\")\n",
    "# install.packages(\"tm\")\n",
    "# install.packages(\"ggwordcloud\")\n",
    "# install.packages(\"extrafont\")\n",
    "# install.packages(\"ggpubr\")\n",
    "# install.packages(\"readxl\")\n",
    "# install.packages(\"tidyverse\")\n",
    "# install.packages(\"writexl\")\n",
    "# install.packages(\"SnowballC\")\n",
    "\n",
    "# =====================================================\n",
    "# Load Required Libraries\n",
    "# =====================================================\n",
    "\n",
    "library(readxl)            # For reading Excel files\n",
    "library(tidyverse)         # Core tidyverse packages: dplyr, ggplot2, tidyr, etc.\n",
    "library(writexl)           # For writing Excel files\n",
    "library(RColorBrewer)      # Provides color palettes for visualizations\n",
    "library(tm)                # Text mining package\n",
    "library(gridExtra)         # For arranging grid-based plots\n",
    "library(pastecs)           # Descriptive statistics\n",
    "library(quanteda)          # Text analysis package\n",
    "library(quanteda.textstats)# Text statistics for quanteda objects\n",
    "library(quanteda.textplots)# Visualization tools for text data\n",
    "library(readtext)          # Efficient reading of text files (e.g., PDFs, .txt)\n",
    "library(SnowballC)         # Snowball stemmer for text preprocessing\n",
    "library(ggwordcloud)       # Word cloud visualizations using ggplot2\n",
    "library(extrafont)         # Additional fonts for publication-quality plots\n",
    "library(ggpubr)            # 'ggplot2'-based publication-ready plots\n",
    "library(readxl)\n",
    "library(stringr)\n",
    "library(dplyr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[1m\u001b[22mNew names:\n",
      "\u001b[36m•\u001b[39m `` -> `...1`\n",
      "\u001b[1m\u001b[22mNew names:\n",
      "\u001b[36m•\u001b[39m `` -> `...1`\n"
     ]
    }
   ],
   "source": [
    "# Load data\n",
    "# Both workbooks are expected to provide a `cleaned` text column, which is normalized at the end of this cell.\n",
    "citizens <- read_excel(\"data/citizens_keyness_analysis.xlsx\")\n",
    "councilors <- read_excel(\"data/councilors_keyness_analysis.xlsx\")\n",
    "\n",
    "# Define substitutions in a named vector\n",
    "# (names are the patterns to search for, values are the text that replaces them).\n",
    "# NOTE: the patterns carry no word-boundary anchors, so they also match inside longer words, and\n",
    "# str_replace_all() works through a named vector one pair at a time in the order listed, each pair\n",
    "# operating on the output of the previous one. The order is therefore part of the logic.\n",
    "# Several entries come in pairs (e.g. \"νησια\" -> \"νησι\" followed by \"νησι\" -> \"νησια\"): the first\n",
    "# presumably strips the longer form so that the second cannot append the ending twice.\n",
    "# Do not reorder, sort or de-duplicate these entries without re-checking the normalized text.\n",
    "replacements <- c(\n",
    "  \"1%\" = \"ενα_τις_εκατο\",\n",
    "  \"_πολυ\" = \"πολυ\",\n",
    "  \"α πολυ τως\" = \"απολυτως\",\n",
    "  \"διαβιωσεις\" = \"διαβιωσης\",\n",
    "  \"διαβιωσης\" = \"διαβιωση\",\n",
    "  \"διαβιωση\" = \"διαβιωσης\",\n",
    "  \"ελεγχεται\" = \"ελεγχομενη\",\n",
    "  \"παιδια\" = \"παιδι\",\n",
    "  \"ελεγχομενα\" = \"ελεγχομενη\",\n",
    "  \"μορφωσει\" = \"μορφωση\",\n",
    "  \"διασφαλιζει\" = \"διασφαλιζε\",\n",
    "  \"διασφαλιζε\" = \"διασφαλιζει\",\n",
    "  \"εισβολεας\" = \"εισβολεα\",\n",
    "  \"εισβολεα\" = \"εισβολεας\",\n",
    "  \"νησια\" = \"νησι\",\n",
    "  \"νησι\" = \"νησια\",\n",
    "  \"ανθρωπινα\" = \"ανθρωπινες\",\n",
    "  \"μουσουλμανοι\" = \"μουσουλμανο\",\n",
    "  \"μουσουλμανο\" = \"μουσουλμανοι\",\n",
    "  \"νομος\" = \"νομο\",\n",
    "  \"νομο\" = \"νομος\",\n",
    "  \"μορφωσε\" = \"μορφωση\",\n",
    "  \"περιθαλψει\" = \"περιθαλψη\",\n",
    "  \"κανονας\" = \"κανονα\",\n",
    "  \"κανονα\" = \"κανονας\",\n",
    "  \"τηρηθουν\" = \"τηρηση\",\n",
    "  \"α κλειστες\" = \"κλειστες\",\n",
    "  \"εξοδα\" = \"εξοδο\",\n",
    "  \"προσωρινες\" = \"προσωρινα\",\n",
    "  \"ποσοστο\" = \" % \",\n",
    "  \"γικλειστες\" = \"κλειστες\",\n",
    "  \"σηκωσει\" = \"σηκωνει\",\n",
    "  \"μικλειστες\" = \"κλειστες\",\n",
    "  \"απελασει\" = \"απελαση\",\n",
    "  \"εντασσει\" = \"ενταξη\",\n",
    "  \"τηρει\" = \"τηρηση\",\n",
    "  \"ενσωματωθει\" = \"ενσωματωση\",\n",
    "  \"εγκληματιας\" = \"εγκληματιες\",\n",
    "  \"βοηθα\" = \"βοηθεια\",\n",
    "  \"επιβαλλει\" = \"επιβαλλουν\",\n",
    "  \"αναγνωρισμενες\" = \"αναγνωρισμενοι\"\n",
    ")\n",
    "\n",
    "# Apply replacements using str_replace_all from stringr\n",
    "#   text:         character vector to normalize\n",
    "#   replacements: named character vector of pattern = replacement pairs\n",
    "# Returns whatever str_replace_all(text, replacements) returns for these inputs.\n",
    "normalize_text <- function(text, replacements) {\n",
    "  str_replace_all(text, replacements)\n",
    "}\n",
    "\n",
    "# Apply to both datasets\n",
    "# (the `cleaned` column is overwritten in place with its normalized version)\n",
    "citizens <- citizens %>% mutate(cleaned = normalize_text(cleaned, replacements))\n",
    "councilors <- councilors %>% mutate(cleaned = normalize_text(cleaned, replacements))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message in eval(expr, envir, enclos):\n",
      "\"NAs introduced by coercion\"\n",
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n",
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n",
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    }
   ],
   "source": [
    "# Group labels, one entry per respondent row\n",
    "citizen_labels <- rep(\"citizen\", times = nrow(citizens))\n",
    "councilors_labels <- rep(\"councilor\", times = nrow(councilors))\n",
    "\n",
    "# Citizens: keep the analysis columns and append the group label as the last column\n",
    "citizens_for_combined <- citizens %>%\n",
    "  select(Anonymous_id, cleaned, Q26_edu, pol_orient, Q29_income, Q10_c_post, treat1) %>%\n",
    "  mutate(citizen_or_councilor = citizen_labels)\n",
    "\n",
    "# Councilors: same columns in the same order, renamed on selection to the citizen column names.\n",
    "# Q10_c_post is coerced to numeric afterwards; entries that cannot be parsed become NA (with a warning).\n",
    "councilors_for_combined <- councilors %>%\n",
    "  select(\n",
    "    Anonymous_id = id_anonymous, cleaned, Q26_edu, pol_orient = pol_orient_x,\n",
    "    Q29_income, Q10_c_post = Q10_c, treat1\n",
    "  ) %>%\n",
    "  mutate(\n",
    "    citizen_or_councilor = councilors_labels,\n",
    "    Q10_c_post = as.numeric(as.character(Q10_c_post))\n",
    "  )\n",
    "\n",
    "# Stack both groups into a single data frame\n",
    "combined <- bind_rows(citizens_for_combined, councilors_for_combined)\n",
    "\n",
    "# quanteda corpora built from the cleaned text of each data set\n",
    "corp_councilors <- corpus(councilors$cleaned)\n",
    "corp_citizens <- corpus(citizens$cleaned)\n",
    "corp_combined <- corpus(combined$cleaned)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:27:11.040127Z",
     "iopub.status.busy": "2022-07-03T13:27:11.038269Z",
     "iopub.status.idle": "2022-07-03T13:27:11.119699Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [],
   "source": [
    "# Read the base Greek stopword list: column `word` of sheet 'stopwords' in the Excel file\n",
    "stop_words <- read_excel(\"data/stopwords_GR.xlsx\", sheet = \"stopwords\")$word\n",
    "\n",
    "# Extend stopwords with additional Greek words and common terms\n",
    "# The additions are hand-entered literal forms (lower-case, written without accents) and are appended\n",
    "# to the list loaded from the Excel file. A few terms are listed more than once (e.g. \"εχει\", \"κυριως\",\n",
    "# \"εδω\"); c() keeps such repeats, so the resulting vector is not de-duplicated here.\n",
    "# NOTE(review): some entries look like misspellings (e.g. \"επρπε\", \"αντιθετρα\") - presumably they mirror\n",
    "# typos that occur in the survey answers; confirm before correcting them.\n",
    "stop_words <-  c(stop_words, \"μου\",\"των\",\"στις\",\"ως\",\"τετοια\",\"διοτι\",\"λογο\",\"ειχα\",\"μπορει\",\"μας\",\"μεσα\",\"μεσου\",\"και\",\"να\",\"τα\",\"με\",\"τον\",\"τους\",\n",
    "                     \"σε\",\"θα\",\"οι\",\"ειναι\",\"καθως\",\"στην\",\"στο\",\"γιατι\",\"επειδη\",\"αφου\",\"στα\",\"στον\",\"θεωρουμε\",\"δεν\",\"καμια\",\"δομ\",\"του\",\n",
    "                     \"ομως\",\"πληθυσμο\",\"της\",\"απο\",\"ισχυρα\",\"εχει\",\"ζησει\",\"οχι\",\"για\",\"αποτελει\",\"κυριως\",\"εχει\",\"εν\",\"κι\",\"την\",\"το\",\"κυριως\",\n",
    "                     \"εκτος\",\"επιπεδα\",\"οτι\",\"εδω\",\"υποστηριζα\",\"φυσικα\",\"σχετικα\",\"συμβαν\",\"εδω\",\"ειτε\",\"μια\",\"δε\",\"ας\",\"καθε\",\"μαζι\",\"ον\",\"στ\",\"τοπ\",\n",
    "                     \"κραταει\",\"θελαμε\",\"γινανε\",\"μονα\",\"θελει\",\"μεριας\",\"δεχεται\",\"κατοικει\",\"ερχεται\",\"ενταξει\",\"θεωρει\",\"καν\",\"χρειαζεστε\",\"προσωπα\",\"δινει\",\n",
    "                     \"ερθει\",\"υποστηριξει\",\"καταστασεις\",\"υπολοιπα\",\"βρισκανε\",\"φιλοξενουμενο\",\"πιστευει\",\"λογης\",\"γινει\",\"μονη\",\"υποστηριξα\",\"προσωπικα\",\"πιστευοντας\",\n",
    "                     \"χρειαζεται\",\"καταστασεων\",\"κανενα\",\"μερα\",\"λα\",\"θες\",\"κοσμικου\",\"αφορα\",\"κλπ\",\"αν\",\"στοιχεια\",\"πρεπει\",\"περιπτωσεις\",\"πολλα\",\"δικα\",\"ηρθα\",\"ουτε\",\n",
    "                     \"πρωτα\",\"λιγα\",\"τη\",\"επιπλεον\",\"διαφορετικα\",\"οπως\",\"οταν\",\"δευτερη\",\"κοσμο\",\"δημος\",\"πληρως\",\"δινεται\",\"ειμαστε\",\"της\",\"κατα\",\"καποια\",\"πραγμα\",\"τρ\",\n",
    "                     \"οποια\",\"φιλοξενει\",\"μεχρι\",\"τις\",\"εφοσον\",\"προκειμενου\",\"συστημα\",\"κεντρα\",\"αδυνατο\",\"κυριαρχει\",\"αιτησεις\",\"αυτην\",\"εως\",\"μονον\",\"λαβει\",\n",
    "                     \"ζηταγαν\",\"δυνατη\",\"ζηταγα\",\"ουσιαστικα\",\"τωρα\",\"συμβαινει\",\"προσπαθεια\",\"αυτα\",\"αλλες\",\"μεγαλες\",\"πληρη\",\"λυσει\",\"τελος\",\"ετσι\",\"δηλωση\",\"μην\",\"σημαντικα\",\"λαθος\",\"εννοια\",\n",
    "                     \"δυναμει\",\"ζω\",\"υπαρξει\",\"ετη\",\"θεωρειται\",\"ηταν\",\"εναντι\",\"ειδη\",\"εγκατασταθει\",\"οσο\",\"χρονια\",\"καταλληλα\",\"που\",\"τεκνα\",\"κερκυρα\",\"βασει\",\"ατομα\",\"εμπειρια\",\"εκει\",\"δηλαδη\",\n",
    "                     \"απολυτα\",\"ακομα\",\"φυγει\",\"νομιζω\",\"τουλαχιστον\",\"δεδομενα\",\"αλλα\",\"επιθυμει\",\"σχολη\",\"θετικα\",\"συνολικα\",\"μακρα\",\"εντελως\",\"καθολου\",\"πολιτες\",\"φθανει\",\"φευγουν\",\"υποδεχθει\",\"υπαρχει\",\n",
    "                        \"τυπου\",\"συναφη\",\"υ\",\"παει\",\"τροπο\",\"ελεγα\",\"ερωτηματολογιο\",\"πολεις\",\"δυο\",\"γενικα\",\"φιλοξενιας\",\"ς\",\"υς\",\"πηγαινα\",\"επιλογες\",\"ωστε\",\"διαδ\",\"ομορα\",\"μεν\",\"πανε\",\"ζει\",\"ορι\",\n",
    "                        \"θεμα\",\"φορα\",\"σωστα\",\"αριθμο\",\"ζητα\",\"ποσοστα\",\"μπαιναν\",\"υπηρχαν\",\"προσπαθει\",\"τυπικη\",\"σπιτι\",\"προσπαθει\",\"μες\",\"ξεραμε\",\"βραν\",\"μιλαει\",\"σχεσει\",\"μεινανε\",\"ενω\",\"χωρα\",\"προσφυγας\",\n",
    "                         \"μεταναστες\",\"προυποθεσεις\",\"διαμονη\",\"λειτουργει\",\"φερανε\",\"παρεχει\",\"αντιμετωπισει\",\"προκειται\",\"σκοπια\",\"νεα\",\"εγω\",\"εκτη\",\"ελευθερα\",\"δοθει\",\n",
    "                        \"στηριξει\",\"μαζα\",\"ιδια\",\"εισερχεται\",\"επρπε\",\"γνωριζα\",\"δυστυχως\",\"αιτουντα\",\"φυγανε\",\"υποδοχη\",\"διαφωνεις\",\"οντως\",\"τελει\",\"αντιθετρα\",\"παραμονες\",\"ιστο\",\"σιγουρα\",\"ερευνα\",\"παιρναμε\",\n",
    "                         \"πλεον\",\"μαθαμε\",\"αντιμετωπιζαν\",\"προκαλει\",\"συγκεκριμενα\",\"ασυλα\",\"ελαχιστα\",\"ειδα\",\"βλεπει\",\"απολυτως\",\"επιστρεψουμε\",\"ερωτησεις\",\"δρασεις\",\"ζητημα\",\"διαμενουν\",\"ροες\",\"προσφεραν\",\n",
    "                         \"πληρωναν\",\"φιλοξενουμενοι\",\"διαμενουν\",\"ελλαδα\",\"ελληνας\",\"περιοχες\",\"βασικα\",\"επρεπε\",\"στηριζα\",\"προερχεται\",\"ξενα\",\"αρχας\",\"δεχτει\",\"υποχρεωσεις\",\"βοηθεια\",\"περισσοτερα\",\"διαφορα\",\n",
    "                        \"ζωνες\",\"ελεγχαν\",\"χωρανε\",\"αρκετα\",\"κατοικια\",\"αμεσα\",\"ορο\",\"διαφωνω\",\"γνωριζει\",\"μεγαλα\",\"παρολα\",\"κτλ\",\"α\",\"'\",\"αρα\",\"τυχη\",\"πω\",\"φτιαχτε\",\"πραξει\",\"τυχει\",\"τριτα\",\"σειρα\",\"συνεχιζει\",\n",
    "                        \"ουτως\",\"ουσια\",\"οποιοδηποτε\",\"ομαδα\",\"λεγεται\",\"επιτρεπει\",\"λεξεις\",\"απαντες\",\"αντιστοιχα\",\"στειλει\",\"στοχο\",\"χαρακτηρα\",\"ντρεπεστε\",\"κομματι\",\"κυνηγαει\",\"ανθρωπο\",\"ετοιμο\",\"αποκτησει\",\n",
    "                        \"πλαισια\",\"αυξημενες\",\"αντιθετα\",\"κυριο\",\"καταρχας\",\"εξυπηρετησουν\",\"απεναντι\",\"πτυχιο\",\"συμπληρωσα\",\"σουβλιου\",\"επομενως\",\"ιδιαιτερα\",\"μεινει\",\"λοιπον\",\"μπει\",\"απαντησεις\",\"προτασεις\",\n",
    "                        \"ερωτημα\",\"ουδεν\",\"ακολουθει\",\"πολλου\",\"αλλαξει\",\"σημασια\",\"βιωνουμε\",\"αποτελειται\",\"αιτημα\",\"ωρα\",\"αφηνει\",\"παταει\",\"αποψεις\",\"απονομη\",\"αποδοση\",\"χρηζουν\",\"αυξαναν\",\"νιωθει\",\"οσ\",\"ος\",\n",
    "                        \"μον\",\"νε\",\"εχ\",\"λογ\",\"τες\",\"δι\",\"ις\",\"παν\",\"βλεπε\",\"πρεπε\",\"ξερα\",\"φεραν\",\"μιλα\",\"μερη\",\"λτ\",\"γτ\",\"σχεδον\",\"φευγανε\",\"βαζει\",\"τχα\",\"ες\",\"δνα\",\"δν\",\"απ\",\"δειχνει\",\"τελειως\",\"δει\",\"δομη\",\n",
    "                        \"δομες\",\"φτιαχτει\",\"γεματα\",\"κατασκευες\",\"περνα\",\"ακολουθανε\",\"οδηγει\",\"υποψη\",\"διαθετει\",\"δηθεν\",\"επικρατει\",\"ιαπωνια\",\"ευρυτερη\",\"παραπανω\")\n",
    "\n",
    "# Create bigram stopwords list (for n-gram removal)\n",
    "# This vector used to be c(stop_words, <literal terms>), where the literal terms repeated, word for word,\n",
    "# the first part of the extension already appended to `stop_words`. It therefore matched exactly the same\n",
    "# tokens as `stop_words`. Reusing `stop_words` (de-duplicated) keeps that behaviour with a single source\n",
    "# of truth: a term added to `stop_words` is now automatically part of the bigram filter as well.\n",
    "stop_words_bigrams <- unique(stop_words)\n",
    "\n",
    "# List of specific bigram stopwords (custom phrases to remove)\n",
    "# Each entry is a two-word feature joined with \"_\"; the list is used as a removal pattern for the\n",
    "# n-gram tokens built in the cells that follow. Five entries are typed twice (first and last line),\n",
    "# so unique() is applied to keep every phrase once.\n",
    "st <- unique(c(\"μουσουλμανοι_μουσουλμανοι\",\"αναλογα_αναλογα\",\"απεναντι_παιδια\",\"ανθρωπια_τοπικη\",\"ανθρωπια_τουρκια\",\"παιδια_ενταξη\",\n",
    "        \"κλειστες_ανθρωπια\",\"διαβιωση_σεβασμο\",\"ανθρωπια_ανθρωπια\",\"καθετα_αντιθετα\",\"ανθρωπια_κοινωνια\",\"εξετασει_ασυλο\",\"αιτημα_ασυλο\",\"χορηγηση_ασυλο\",\n",
    "        \"πραγματικος_λαθρο\",\"ομαλα_κοινωνια\",\"παροχες_ασυλο\",\"πραγματικος_εμπολεμη\",\"βοηθα_ανθρωπια\",\"πυλες_εισοδο\",\"5_%\",\"ανοικτη_κλειστες\",\n",
    "        \"λυνεται_προβλημα\",\"ανθρωπια_παρανομα\",\"προβλημα_προβλημα\",\"ασυλο_επιστρεφανε\",\"ανθρωπια_πραγματικος\",\"νομιμα_πυλες\",\"κοινωνια_οικονομικη\",\n",
    "        \"διαβιωση_ανθρωπια\",\"βοηθα_οικονομικοι\",\"κλειστες_ακατοικητα\",\"ανδρες_γυναικα\",\"στρατος_εκκλησια\",\"ανθρωπινες_αξιοπρεπεια\",\"πολεμος_οικονομικοι\",\n",
    "        \"μουσουλμανοι_μουσουλμανοι\",\"αναλογα_αναλογα\",\"λαθρο_λαθρο\",\"απεναντι_παιδια\",\"ανθρωπια_τοπικη\",\"ανθρωπια_τουρκια\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:27:11.146024Z",
     "iopub.status.busy": "2022-07-03T13:27:11.144291Z",
     "iopub.status.idle": "2022-07-03T13:27:12.444849Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    }
   ],
   "source": [
    "# Citizens: bigram frequency table, filtered with every stopword list\n",
    "# (stop_words, stop_words_bigrams and the custom bigram phrases in st).\n",
    "# NOTE: fr_1 is assigned again by a later cell, so this result only lives until that cell runs.\n",
    "\n",
    "# Pass 1: tokenize and drop matching tokens (fixed-string matching)\n",
    "toks <- tokens_remove(\n",
    "  tokens(citizens$cleaned),\n",
    "  pattern = phrase(c(stop_words, st, stop_words_bigrams)),\n",
    "  valuetype = \"fixed\"\n",
    ")\n",
    "\n",
    "# Pass 2: same patterns with the default matching, then join neighbouring tokens into bigrams (n = 2)\n",
    "toks_ngram <- toks %>%\n",
    "  tokens_select(pattern = phrase(c(stop_words, st, stop_words_bigrams)), selection = \"remove\") %>%\n",
    "  tokens_ngrams(n = 2)\n",
    "\n",
    "# Drop the listed features from the bigrams and build the document-feature matrix\n",
    "dfmat <- dfm(tokens_remove(toks_ngram, c(stop_words, st, stop_words_bigrams)))\n",
    "\n",
    "# Frequency statistics per feature\n",
    "fr_1 <- textstat_frequency(dfmat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:27:12.450338Z",
     "iopub.status.busy": "2022-07-03T13:27:12.448448Z",
     "iopub.status.idle": "2022-07-03T13:27:13.011788Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    }
   ],
   "source": [
    "# Councilors: bigram frequency table, filtered with every stopword list\n",
    "# (stop_words, stop_words_bigrams and the custom bigram phrases in st).\n",
    "# NOTE: fr_2 is assigned again by a later cell, so this result only lives until that cell runs.\n",
    "\n",
    "# Pass 1: tokenize and drop matching tokens (fixed-string matching)\n",
    "toks <- tokens_remove(\n",
    "  tokens(councilors$cleaned),\n",
    "  pattern = phrase(c(stop_words, st, stop_words_bigrams)),\n",
    "  valuetype = \"fixed\"\n",
    ")\n",
    "\n",
    "# Pass 2: same patterns with the default matching, then join neighbouring tokens into bigrams (n = 2)\n",
    "toks_ngram <- toks %>%\n",
    "  tokens_select(pattern = phrase(c(stop_words, st, stop_words_bigrams)), selection = \"remove\") %>%\n",
    "  tokens_ngrams(n = 2)\n",
    "\n",
    "# Drop the listed features from the bigrams and build the document-feature matrix\n",
    "dfmat <- dfm(tokens_remove(toks_ngram, c(stop_words, st, stop_words_bigrams)))\n",
    "\n",
    "# Frequency statistics per feature\n",
    "fr_2 <- textstat_frequency(dfmat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:27:13.094056Z",
     "iopub.status.busy": "2022-07-03T13:27:13.091277Z",
     "iopub.status.idle": "2022-07-03T13:27:14.171675Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    }
   ],
   "source": [
    "# Citizens: bigram frequency table, this time filtering single tokens with stop_words_bigrams only;\n",
    "# stop_words and the custom phrases in st are applied to the finished bigrams.\n",
    "# This replaces the fr_1 computed in the earlier citizens cell.\n",
    "\n",
    "# Pass 1: tokenize and drop matching tokens (fixed-string matching)\n",
    "toks <- tokens_remove(\n",
    "  tokens(citizens$cleaned),\n",
    "  pattern = phrase(stop_words_bigrams),\n",
    "  valuetype = \"fixed\"\n",
    ")\n",
    "\n",
    "# Pass 2: same patterns with the default matching, then join neighbouring tokens into bigrams (n = 2)\n",
    "toks_ngram <- toks %>%\n",
    "  tokens_select(pattern = phrase(stop_words_bigrams), selection = \"remove\") %>%\n",
    "  tokens_ngrams(n = 2)\n",
    "\n",
    "# Drop stopwords / custom phrases from the bigrams and build the document-feature matrix\n",
    "dfmat <- dfm(tokens_remove(toks_ngram, c(stop_words, st)))\n",
    "\n",
    "# Frequency statistics per feature\n",
    "fr_1 <- textstat_frequency(dfmat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:27:14.178939Z",
     "iopub.status.busy": "2022-07-03T13:27:14.176812Z",
     "iopub.status.idle": "2022-07-03T13:27:14.592093Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    }
   ],
   "source": [
    "# Councilors: bigram frequency table, this time filtering single tokens with stop_words_bigrams only;\n",
    "# stop_words and the custom phrases in st are applied to the finished bigrams.\n",
    "# This replaces the fr_2 computed in the earlier councilors cell.\n",
    "\n",
    "# Pass 1: tokenize and drop matching tokens (fixed-string matching)\n",
    "toks <- tokens_remove(\n",
    "  tokens(councilors$cleaned),\n",
    "  pattern = phrase(stop_words_bigrams),\n",
    "  valuetype = \"fixed\"\n",
    ")\n",
    "\n",
    "# Pass 2: same patterns with the default matching, then join neighbouring tokens into bigrams (n = 2)\n",
    "toks_ngram <- toks %>%\n",
    "  tokens_select(pattern = phrase(stop_words_bigrams), selection = \"remove\") %>%\n",
    "  tokens_ngrams(n = 2)\n",
    "\n",
    "# Drop stopwords / custom phrases from the bigrams and build the document-feature matrix\n",
    "dfmat <- dfm(tokens_remove(toks_ngram, c(stop_words, st)))\n",
    "\n",
    "# Frequency statistics per feature\n",
    "fr_2 <- textstat_frequency(dfmat)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Political orientation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T13:27:21.806006Z",
     "iopub.status.busy": "2022-07-03T13:27:21.804012Z",
     "iopub.status.idle": "2022-07-03T13:27:21.894809Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [],
   "source": [
    "# Political-orientation label for citizens, councilors and the combined data:\n",
    "# a score of 5 or more is coded as right-wing, anything below 5 as left-wing.\n",
    "# ifelse() yields NA wherever the orientation score itself is missing.\n",
    "citizens$left_or_right <- with(citizens, ifelse(pol_orient >= 5, \"Right-wing citizens\", \"Left-wing citizens\"))\n",
    "councilors$left_or_right <- with(councilors, ifelse(pol_orient_x >= 5, \"Right-wing councilors\", \"Left-wing councilors\"))\n",
    "combined$left_or_right <- with(combined, ifelse(pol_orient >= 5, \"Δεξιοί\", \"Αριστεροί\"))\n",
    "\n",
    "# Education label: a university degree (\"Πτυχίο ΑΕΙ\") or a postgraduate/doctoral degree counts as\n",
    "# high education, every other non-missing answer as low education.\n",
    "citizens$edu_level <- with(citizens, ifelse(Q26_edu == \"Πτυχίο ΑΕΙ\"|Q26_edu == \"Μεταπτυχιακό ή/ και Διδακτορικό Δίπλωμα\", \"Πολίτες - Υψηλή εκπαίδευση\", \"Πολίτες - Χαμηλή εκπαίδευση\"))\n",
    "councilors$edu_level <- with(councilors, ifelse(Q26_edu == \"Πτυχίο ΑΕΙ\"|Q26_edu == \"Μεταπτυχιακό ή/ και Διδακτορικό Δίπλωμα\", \"Δημοτικοί Σύμβουλοι - Υψηλή εκπαίδευση\", \"Δημοτικοί Σύμβουλοι - Χαμηλή εκπαίδευση\"))\n",
    "combined$edu_level <- with(combined, ifelse(Q26_edu == \"Πτυχίο ΑΕΙ\"|Q26_edu == \"Μεταπτυχιακό ή/ και Διδακτορικό Δίπλωμα\", \"Υψηλή εκπαίδευση\", \"Χαμηλή εκπαίδευση\"))\n",
    "\n",
    "# Subset combined data for right-wing and left-wing, and assign group labels.\n",
    "# which() keeps only the rows where the comparison is TRUE. Indexing with the raw logical vector\n",
    "# would add an all-NA row for every respondent whose pol_orient is missing, and those phantom rows\n",
    "# would then travel into both subsets and into every corpus built from them.\n",
    "right_citizen_or_councilor <- combined[which(combined$pol_orient >= 5), ]\n",
    "right_citizen_or_councilor$citizen_or_councilor <- with(right_citizen_or_councilor, ifelse(citizen_or_councilor == \"citizen\", \"Right-wing citizens\", \"Right-wing councilors\"))\n",
    "\n",
    "left_citizen_or_councilor <- combined[which(combined$pol_orient < 5), ]\n",
    "left_citizen_or_councilor$citizen_or_councilor <- with(left_citizen_or_councilor, ifelse(citizen_or_councilor == \"citizen\", \"Left-wing citizens\", \"Left-wing councilors\"))\n",
    "\n",
    "# Add display labels for citizen/councilor in combined data\n",
    "# (the column name ends in `_greek`, but the values assigned here are the English labels)\n",
    "combined$citizen_or_councilor_greek <- with(combined, ifelse(citizen_or_councilor == \"citizen\", \"Citizens\", \"Councilors\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Figure C. 2: Word clouds of right-wing citizens and right-wing councilors "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T15:02:18.589760Z",
     "iopub.status.busy": "2022-07-03T15:02:18.587006Z",
     "iopub.status.idle": "2022-07-03T15:02:24.166589Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n",
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong>png:</strong> 2"
      ],
      "text/latex": [
       "\\textbf{png:} 2"
      ],
      "text/markdown": [
       "**png:** 2"
      ],
      "text/plain": [
       "png \n",
       "  2 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "set.seed(1234) # for reproducibility\n",
    "\n",
    "# Build the unigram + bigram frequency table for one group of respondents.\n",
    "#   data        : data frame with a 'cleaned' text column and a 'citizen_or_councilor' column\n",
    "#   group_label : value of 'citizen_or_councilor' that selects the group\n",
    "# Uses the stop-word objects `stop_words`, `stop_words_bigrams` and `st` from the global environment.\n",
    "# Returns the textstat_frequency() table of the group's document-feature matrix.\n",
    "group_term_frequencies <- function(data, group_label) {\n",
    "  group_rows <- data[data$citizen_or_councilor == group_label, ]\n",
    "\n",
    "  # One document per row; document names are set to the group label\n",
    "  group_corpus <- corpus(group_rows, text_field = \"cleaned\")\n",
    "  docnames(group_corpus) <- paste(group_rows$citizen_or_councilor, sep = \" \")\n",
    "\n",
    "  # Drop stop words and stop-word bigram phrases, then form unigrams and bigrams\n",
    "  group_tokens <- group_corpus %>%\n",
    "    tokens() %>%\n",
    "    tokens_remove(pattern = phrase(stop_words), valuetype = 'fixed') %>%\n",
    "    tokens_select(pattern = phrase(stop_words_bigrams), selection = \"remove\") %>%\n",
    "    tokens_ngrams(n = 1:2)\n",
    "\n",
    "  # Remove `stop_words` and `st` from the n-gram tokens before counting features\n",
    "  group_tokens %>%\n",
    "    tokens_remove(c(stop_words, st)) %>%\n",
    "    dfm() %>%\n",
    "    textstat_frequency()\n",
    "}\n",
    "\n",
    "# Frequency tables for right-wing citizens and right-wing councilors\n",
    "fr_1 <- group_term_frequencies(right_citizen_or_councilor, \"Right-wing citizens\")\n",
    "fr_2 <- group_term_frequencies(right_citizen_or_councilor, \"Right-wing councilors\")\n",
    "\n",
    "# English display labels, assigned by position: entry i replaces the feature name in row i of the\n",
    "# frequency table, so the order below must match the row order of the table it is applied to.\n",
    "# NOTE(review): only 97 labels are listed for the citizens, while the cloud may show up to 100 terms;\n",
    "# any term ranked below the last label keeps its original feature name - confirm this is intended.\n",
    "labels_citizens <- c('greek','pseudo','closed','asylum','society','real','child','war','economic','problem','europe','illegal','family','deportation','non_governmental_organisations','war','turkey','controlled',\n",
    "      'conditions','%','homeland','education','respected','jobs','law','integration','islands','right','language','money','legal','unfair','culture','customs','small','allowance','muslims','pay','syria',\n",
    "      'governments','morals','humane','european_union','schools','strict','needs','women','security','religion','income','dignity','culture','moral_customs','infrastructure','livelihood','army','one_of_a_hundred',\n",
    "      'job','joining','border','places','entitled','closed_controlled','national','greek_society','crime','papers','health','temporary','benefits','moment','origin','criminal','norm','conduct','christian',\n",
    "      'international','minors','safe','police','respect','heavy','management','afghanistan','proportions','space','capability','invader','organizations','criteria','greek_language','compliance',\n",
    "      'living_conditions','identity','mandatory','majorities','men')\n",
    "# Fail with a clear message if the table has fewer rows than there are labels\n",
    "stopifnot(length(labels_citizens) <= nrow(fr_1))\n",
    "fr_1$feature[seq_along(labels_citizens)] <- labels_citizens\n",
    "\n",
    "labels_councilors <- c('society','local','closed','local_society','integration','Greek','asylum','problem','security','integration','economic','infrastructure','conditions','proportionate','one_hundredth','small',\n",
    "  'real','jobs','benefits','education','children','health','controls','family','governments','livelihood','muslims','schools','normal','suitable','needs','culture','border','europe','strict',\n",
    "  '%','plans','turkey','army','language','law','administration','unfair','criteria','exit','entrance','facilities','tourist','council','temporary','locations','permanent','pseudo','state','controlled',\n",
    "  'entry_exit','european_union','humane','reciprocating','money','compensatory','customs','syria','compliance','policing','islands','religion','integration_local','indigenous','territory','space',\n",
    "  'program','allocation','tourism','services','police','war','quotas','compensatory_benefits','unemployment','non_governmental_organisations','care','services','resources','remuneration',\n",
    "  'culture','measures','living_conditions','severe','legal','enormity','greek_society','ghettoization','production','self-governance','local_self-governance','open','lesson','dignity','reinforcements',\n",
    "  'assimilation','fair','relationships','shares','ethics','ethics_customs','organizations','community','Greek_language','raises','hirings','gradual','business','recruitment','necessary',\n",
    "  'construction','easy','coherence','responsibility','assimilated','strict_criteria','difficult','%_one_of_a_hundred','humanitarian','treatment','religious','work','psychological','paradigm')\n",
    "stopifnot(length(labels_councilors) <= nrow(fr_2))\n",
    "fr_2$feature[seq_along(labels_councilors)] <- labels_councilors\n",
    "\n",
    "# Set plot size\n",
    "options(repr.plot.width=10, repr.plot.height=15)\n",
    "\n",
    "# Word cloud for right-wing citizens: black, unrotated text, at most 100 terms\n",
    "plot1 <- ggwordcloud(fr_1$feature, fr_1$frequency, scale = c(4, 1),\n",
    "  max.words = 100, random.order = F, random.color = FALSE,\n",
    "  rot.per = 0, colors = \"black\", ordered.colors = FALSE, shape = 'circle') +\n",
    "  theme(plot.title = element_text(hjust = 0.5, size = 40, vjust=-2)) +\n",
    "  ggtitle(\"Right-wing citizens\")\n",
    "\n",
    "# Word cloud for right-wing councilors, same settings apart from the title offset\n",
    "plot2 <- ggwordcloud(fr_2$feature, fr_2$frequency, scale = c(4, 1),\n",
    "  max.words = 100, random.order = F, random.color = FALSE,\n",
    "  rot.per = 0, colors = \"black\", ordered.colors = FALSE, shape = 'circle') +\n",
    "  theme(plot.title = element_text(hjust = 0.5, size = 40, vjust=-4)) +\n",
    "  ggtitle(\"Right-wing councilors\")\n",
    "\n",
    "# Write both clouds, one above the other, to the figure file\n",
    "png(\"../figures/Figure C2.png\", width = 2400, height = 3600, res = 200)\n",
    "\n",
    "grid.arrange(plot1, plot2, nrow=2)\n",
    "\n",
    "dev.off()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Figure C. 3: Word clouds of left-wing citizens and left-wing councilors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n",
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong>png:</strong> 2"
      ],
      "text/latex": [
       "\\textbf{png:} 2"
      ],
      "text/markdown": [
       "**png:** 2"
      ],
      "text/plain": [
       "png \n",
       "  2 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "set.seed(1234) # for reproducibility\n",
    "\n",
    "# Build the unigram + bigram frequency table for one group of respondents.\n",
    "#   data        : data frame with a 'cleaned' text column and a 'citizen_or_councilor' column\n",
    "#   group_label : value of 'citizen_or_councilor' that selects the group\n",
    "# Uses the stop-word objects `stop_words`, `stop_words_bigrams` and `st` from the global environment.\n",
    "# Returns the textstat_frequency() table of the group's document-feature matrix.\n",
    "group_term_frequencies <- function(data, group_label) {\n",
    "  group_rows <- data[data$citizen_or_councilor == group_label, ]\n",
    "\n",
    "  # One document per row; document names are set to the group label\n",
    "  group_corpus <- corpus(group_rows, text_field = \"cleaned\")\n",
    "  docnames(group_corpus) <- paste(group_rows$citizen_or_councilor, sep = \" \")\n",
    "\n",
    "  # Drop stop words and stop-word bigram phrases, then form unigrams and bigrams\n",
    "  group_tokens <- group_corpus %>%\n",
    "    tokens() %>%\n",
    "    tokens_remove(pattern = phrase(stop_words), valuetype = 'fixed') %>%\n",
    "    tokens_select(pattern = phrase(stop_words_bigrams), selection = \"remove\") %>%\n",
    "    tokens_ngrams(n = 1:2)\n",
    "\n",
    "  # Remove `stop_words` and `st` from the n-gram tokens before counting features\n",
    "  group_tokens %>%\n",
    "    tokens_remove(c(stop_words, st)) %>%\n",
    "    dfm() %>%\n",
    "    textstat_frequency()\n",
    "}\n",
    "\n",
    "# Frequency tables for left-wing citizens and left-wing councilors\n",
    "fr_1 <- group_term_frequencies(left_citizen_or_councilor, \"Left-wing citizens\")\n",
    "fr_2 <- group_term_frequencies(left_citizen_or_councilor, \"Left-wing councilors\")\n",
    "\n",
    "# English display labels, assigned by position: entry i replaces the feature name in row i of the\n",
    "# frequency table, so the order below must match the row order of the table it is applied to.\n",
    "# NOTE(review): 'living conditions' is written with a space although 'living_conditions' (underscore)\n",
    "# also appears a few entries later - confirm which label the third-ranked term should carry.\n",
    "labels_citizens <- c('society','humane','living conditions','dignity','Greek','education','child','asylum','living_conditions','economic','problem','right','integration','work','needs','school','open','infrastructure',\n",
    "      'closed','health','family','europe','human_conditions','money','places','possibility','culture','real','small','language','unfair','security','european_union','care','program','governments','respect',\n",
    "      'necessities','pay','proper','management','join','war','dignity_conditions','churches','international','input','education','rule','greek_society','open','proportionate','community','controlled','law',\n",
    "      'access','organizations','non_governmental_organizations','human_rights','benefits','war','women','dignity_of_living','web','solidarity','integration_society','exit','normal','religion','difficult','kids_school','papers','police',\n",
    "      'organized','medical','resources','facilities','culture','benefits','reinforcements','work','update','plans','fear','medical','stealth','apartment','moment','permanent','racists','minors','transparency',\n",
    "      'forms','camps','organization','responsible','integration_society','observance','entry_exit','customs','homeland','responsibilities','services','medical_care','illegal','education','prisons','assimilated',\n",
    "      'funds','psychological','ought','secured','opportunities','care','learning','%','state','adults','healthy','urban','respect','morals','abide','participated','gave','covered','secured','villages','islands')\n",
    "# Fail with a clear message if the table has fewer rows than there are labels\n",
    "stopifnot(length(labels_citizens) <= nrow(fr_1))\n",
    "fr_1$feature[seq_along(labels_citizens)] <- labels_citizens\n",
    "\n",
    "# NOTE(review): 'jobs designs' is a single entry containing a space; if it was meant to be two labels,\n",
    "# every later label is shifted by one row. The row starting with 'mandated' and the row after it also\n",
    "# repeat nearly the same 14 labels ('overcome' ... 'unaccompanied'). Verify both against the\n",
    "# frequency table.\n",
    "labels_councilors <- c('society','local','conditions','integration','humane','local_society','security','economics','livelihood','open','state','greek','schools','education','governments','dignity','respective','child',\n",
    "      'jobs designs','benefits','human_conditions','suitable','sanitary','health','european_union','seriously','recruitment','facilities','management','acceptance','needs','europe','compensatory',\n",
    "      'maintenance','organized','smooth','reliable','services','employment','dignity_conditions','responsibilities','professional','capacity','essentials','records','stay','reinforcements','rule','care',\n",
    "      'integration_local','unit','integration_society','criteria','positions','measures','custody','easy','military','permanent','urban','gradual','assimilation','law','houses','negative','ghettoization',\n",
    "      'difficult','basic','residents','compensating_benefits','culture','controlled','sanitary','program','care','respect_humanity','regular_integration','safe','budget','real','dignity_of_living','benefits',\n",
    "      'learning','language','ethics','customs','locales','ethics','customs','ghetto','reactions','villages','surveillance','churches','hospitals','organizations','inform','non_governmental_organizations',\n",
    "      'mandated','temporary','legal','implemented','overcome','adequate','poverty','state','doctors','enormous','distances','rewarding','rewarding_benefits','secures','barter','burden','war','unaccompanied',\n",
    "      'overcome','sufficient','poverty','state','doctors','enormity','distances','rewarding','rewarding_benefits','ensure','exchanges','burden','war','unaccompanied','unaccompanied_child','national','crisis',\n",
    "      'proportionality','region','profile','plans_join','logic','find','scientists','host','group','construction','construction_infrastructure','rural','symbol','suitable_infrastructure','health_conditions',\n",
    "      'council','piece')\n",
    "stopifnot(length(labels_councilors) <= nrow(fr_2))\n",
    "fr_2$feature[seq_along(labels_councilors)] <- labels_councilors\n",
    "\n",
    "# Set plot size\n",
    "options(repr.plot.width=10, repr.plot.height=15)\n",
    "\n",
    "# Word cloud for left-wing citizens: black, unrotated text, at most 100 terms\n",
    "plot1 <- ggwordcloud(fr_1$feature, fr_1$frequency, scale = c(4, 1),\n",
    "  max.words = 100, random.order = F, random.color = FALSE,\n",
    "  rot.per = 0, colors = \"black\", ordered.colors = FALSE, shape = 'circle') +\n",
    "  theme(plot.title = element_text(hjust = 0.5, size = 40, vjust=-4)) +\n",
    "  ggtitle(\"Left-wing citizens\")\n",
    "\n",
    "# Word cloud for left-wing councilors, same settings apart from the title offset\n",
    "plot2 <- ggwordcloud(fr_2$feature, fr_2$frequency, scale = c(4, 1),\n",
    "  max.words = 100, random.order = F, random.color = FALSE,\n",
    "  rot.per = 0, colors = \"black\", ordered.colors = FALSE, shape = 'circle') +\n",
    "  theme(plot.title = element_text(hjust = 0.5, size = 40, vjust=-3)) +\n",
    "  ggtitle(\"Left-wing councilors\")\n",
    "\n",
    "# Write both clouds, one above the other, to the figure file\n",
    "png(\"../figures/Figure C3.png\", width = 2400, height = 3600, res = 200)\n",
    "\n",
    "grid.arrange(plot1, plot2, nrow=2)\n",
    "\n",
    "dev.off()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Figure 6: Keyness analysis on citizens, by political ideology"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-03T14:53:43.817858Z",
     "iopub.status.busy": "2022-07-03T14:53:43.816000Z",
     "iopub.status.idle": "2022-07-03T14:53:45.539686Z"
    },
    "vscode": {
     "languageId": "r"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message:\n",
      "\"NA is replaced by empty string\"\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong>png:</strong> 2"
      ],
      "text/latex": [
       "\\textbf{png:} 2"
      ],
      "text/markdown": [
       "**png:** 2"
      ],
      "text/plain": [
       "png \n",
       "  2 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Set seed for reproducibility\n",
    "set.seed(1234)\n",
    "\n",
    "# Create a text corpus from the 'citizens' data frame, using the 'cleaned' column as the text,\n",
    "# and name each document after the respondent's 'left_or_right' value\n",
    "corp_inaug <- corpus(citizens, text_field = \"cleaned\")\n",
    "docid <- paste(citizens$left_or_right, sep = \" \")\n",
    "docnames(corp_inaug) <- docid\n",
    "\n",
    "# Tokenize the corpus and remove stop words using exact (fixed) matching\n",
    "toks <- corp_inaug %>%\n",
    "  tokens() %>%\n",
    "  tokens_remove(pattern = phrase(stop_words), valuetype = 'fixed')\n",
    "\n",
    "# Second stop-word pass, this time with the default valuetype.\n",
    "# NOTE(review): the word-cloud cells pass `stop_words_bigrams` at this step. Left unchanged here because\n",
    "# the positional English labels below depend on the exact feature ranking this pipeline produces.\n",
    "toks_ngram <- tokens_select(toks, pattern = phrase(stop_words), selection = \"remove\")\n",
    "\n",
    "# Create unigrams and bigrams from the tokens\n",
    "toks_ngram <- tokens_ngrams(toks_ngram, n = 1:2)\n",
    "\n",
    "# Remove `stop_words` and the custom list `st` from the n-gram tokens, then build the\n",
    "# document-feature matrix (DFM)\n",
    "dfmat1 <- toks_ngram %>%\n",
    "  tokens_remove(c(stop_words, st)) %>%\n",
    "  dfm()\n",
    "\n",
    "# Trim the DFM to keep only features that appear in at least 3 documents\n",
    "dfmat1 <- dfm_trim(\n",
    "  dfmat1,\n",
    "  min_docfreq = 3,\n",
    "  docfreq_type = \"count\"\n",
    ")\n",
    "\n",
    "# Group the DFM by the 'left_or_right' document variable\n",
    "dfmat1 <- dfm_group(dfmat1, groups = dfmat1$left_or_right)\n",
    "\n",
    "# Calculate keyness statistics using the likelihood-ratio measure, with \"Right-wing citizens\" as the target\n",
    "tstat1 <- quanteda.textstats::textstat_keyness(dfmat1, measure = \"lr\", target = \"Right-wing citizens\")\n",
    "\n",
    "# English display labels for the first rows of the keyness table. The mapping is positional:\n",
    "# entry i replaces the feature name in row i, so the order must match the row order of tstat1.\n",
    "target_labels <- c('pseudo','deport','closed','illegal','subsidy','turkey','real','syria','legal','%','closed_controlled','respected','homeland','non_governmental_organizations','greek','muslim','islands',\n",
    "      'invader','war','strict','entitled','embassy','controlled','national','afghanistan','uninhabited_islands','europe','uninhabited','islam','ethics','origin','customs','customs','interests',\n",
    "      'entitle_asylum','returns','border','questions','alteration','men','east','indigenous','enforce','unchecked','law','distances','cost','real_war','asylum','asylum_practical','say','irregularities',\n",
    "      'christians','indifference','armenia','incoming','examine','one_of_a_hundred','army','deportation_sneak','deterrence','left','intentions','clear','steps','classes','crime','safe','primary','identity',\n",
    "      'athens','demands','proved','destroy','china','attitudes','family','otherwise','africa','inflows','impose','orthodox','fanatics','nowhere','majorities','cares','pakistan','tolerated','persecuted',\n",
    "      'indoors','colonization','islamists','pseudo_invader','pseudo_pious','legally_wrong','believes','standards','somalia','pious','religion','turned','bothers','re-promotions','says','overcome',\n",
    "      'traditions','endangered','attitudes','pakistan','delinquent_behavior','points','mandatory','mandatory','violent','germany','respected','fact','declared','mercy','wealth','replacement','distant',\n",
    "      'rape','france','custom_religion','greek_greek','exceptions')\n",
    "\n",
    "# English display labels for the last rows of the keyness table, listed in row order\n",
    "reference_labels <- c('assimilated','medical_care','psychological','education','human_rights','schools','medical','needs','open','community','apartment','health','care','transparency','information','possibility',\n",
    "      'churches','appropriate','necessities','program','dignity_conditions','education','human_conditions','society','open','conditions','living_conditions','dignity','humane','livelihood','destroyed')\n",
    "\n",
    "# The two label sets must fit into the table without overlapping\n",
    "stopifnot(length(target_labels) + length(reference_labels) <= nrow(tstat1))\n",
    "\n",
    "tstat1$feature[seq_along(target_labels)] <- target_labels\n",
    "\n",
    "# Relabel the last length(reference_labels) rows. This replaces a hard-coded row range (2559:2589),\n",
    "# which presumably was the tail of a 2589-row table - TODO confirm nrow(tstat1) is 2589 on the original data.\n",
    "reference_rows <- tail(seq_len(nrow(tstat1)), length(reference_labels))\n",
    "tstat1$feature[reference_rows] <- reference_labels\n",
    "\n",
    "# Set plot dimensions\n",
    "options(repr.plot.width = 12, repr.plot.height = 14)\n",
    "\n",
    "# Build the keyness plot (25 terms per group) before opening the graphics device, so that an error\n",
    "# while constructing the plot cannot leave the png device open\n",
    "keyness_plot <- textplot_keyness(tstat1, labelsize = 6, color = c('blue', 'red'), margin = 0.17, n = 25) +\n",
    "  theme(\n",
    "    legend.key.height = unit(1, 'cm'),\n",
    "    legend.key.width = unit(1, 'cm'),\n",
    "    legend.text = element_text(size = 14),\n",
    "\n",
    "    # Customize the plot background and axis appearance\n",
    "    axis.line = element_blank(),\n",
    "    panel.grid.major = element_blank(),\n",
    "    panel.grid.minor = element_line(),\n",
    "    panel.border = element_blank(),\n",
    "    panel.background = element_blank()\n",
    "  )\n",
    "\n",
    "png(\"../figures/Figure 6.png\", width = 2400, height = 3600, res = 200)\n",
    "\n",
    "# ggplot objects are only drawn when printed; print explicitly instead of relying on auto-printing\n",
    "print(keyness_plot)\n",
    "\n",
    "dev.off()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R (hbs_env)",
   "language": "R",
   "name": "hbs_env"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.3.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
